Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.
# Load csvs
datExpr = read.delim('./../Data/datExpr.csv')
datMeta = read.delim('./../Data/datPheno.csv')
# Create dataset with gene information
datGenes = data.frame('Ensembl_ID' = substr(datExpr$Gene, 1, 15),
'gene_name' = substring(datExpr$Gene, 17))
rownames(datExpr) = datGenes$Ensembl_ID
datExpr$Gene = NULL
### CLEAN METADATA DATA FRAME
datMeta = datMeta %>% dplyr::select('ID', 'case', 'sampleid', 'brainregion', 'positiononplate',
'Gender', 'Age', 'SiteHM', 'RIN', 'PMI', 'Dx')
datMeta$brainregion = substr(datMeta$ID, 1, 4)
datMeta = datMeta %>% mutate(brain_lobe = ifelse(brainregion=='ba19', 'Occipital', 'Frontal'),
Diagnosis = ifelse(Dx=='Autism', 'ASD', 'CTL'))
# Convert Diagnosis variable to factor
datMeta$Diagnosis = factor(datMeta$Diagnosis, levels=c('CTL','ASD'))
# sampleid is actually subject ID
datMeta = datMeta %>% dplyr::rename(Subject_ID = sampleid)
# SFARI Genes
SFARI_genes = read_csv('./../../../Models/FirstPUModel/Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
Data description taken from the paper Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism:
Transcriptomes from 104 human brain cortical tissue samples were resolved using next-generation RNA sequencing technology at single-gene resolution and through co-expressing gene clusters or modules. Multiple cortical tissues corresponding to Brodmann Area 19 (BA19), Brodmann Area 10 (BA10) and Brodmann Area 44 (BA44) were sequenced in 62, 14 and 28 samples, respectively, resulting in a total of 57 (40 unique individuals) control and 47 (32 unique individuals) autism samples.
Note: They analysed all of the regions together
Brain tissue: Frozen brain samples were acquired through the Autism Tissue Program, with samples originating from two different sites: the Harvard Brain Tissue Resource Center and the NICHD Brain and Tissue Bank at the University of Maryland (Gandal’s data were obtained also from the Autism Tissue Program, specifically from the Harvard Brain Bank)
Sequenced using Illumina’s HiSeq 2000 (Gandal used Illumina HiSeq 2500) Check if they are compatible
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 62069 genes from 120 samples belonging to 72 different subjects."
# In the paper they talk about an original number of 110 samples and dropping 6 because of low gene coverage, resulting in 104 samples (which are the ones that are included in datMeta), but the expression dataset has 120 samples.
no_metadata_samples = colnames(datExpr)[! colnames(datExpr) %in% datMeta$ID]
no_metadata_subjects = unique(substring(no_metadata_samples, 6))
add_metadata_subjects = no_metadata_subjects[no_metadata_subjects %in% datMeta$Subject_ID]
add_metadata_samples = no_metadata_samples[grepl(paste(add_metadata_subjects, collapse='|'),
no_metadata_samples)]
for(sample in add_metadata_samples){
new_row = datMeta %>% filter(Subject_ID == strsplit(sample,'\\.')[[1]][2]) %>% dplyr::slice(1) %>%
mutate(ID = sample,
brainregion = strsplit(sample,'\\.')[[1]][1],
brain_lobe = ifelse(strsplit(sample,'\\.')[[1]][1]=='ba19','Occipital','Frontal'))
datMeta = rbind(datMeta, new_row)
}
rm(no_metadata_subjects, no_metadata_samples, add_metadata_subjects, add_metadata_samples, sample, new_row)
# And remove the samples that have no metadata and don't have any other samples that do have metadata.
keep = substring(colnames(datExpr), 6) %in% datMeta$Subject_ID
datExpr = datExpr[,keep]
# Match order of datExpr columns and datMeta rows
datMeta = datMeta[match(colnames(datExpr), datMeta$ID),]
# Check they are in the same order
if(!all(colnames(datExpr) == datMeta$ID)){
cat('order of samples don\'t match between datExpr and datMeta!')
}
rm(keep)
#######################################################################################################
# Annotate genes with BioMart information
# Cannot find 1580 ensembl ids using the archive that finds the largest number of genes is feb2014 (I cannot find the missing ones in any other archive version).
datGenes_original = datGenes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org')
# Getting gene info using Ensembl IDs
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position
missing_genes = datGenes_original$gene_name[is.na(datGenes$ensembl_gene_id)]
SFARI_genes_lost = missing_genes[missing_genes %in% SFARI_genes$`gene-symbol` &
!missing_genes %in% datGenes$external_gene_id] %>% unique
# Remove genes that did not return any results in bioMart
datGenes = datGenes %>% filter(!is.na(ensembl_gene_id))
datExpr = datExpr[rownames(datExpr) %in% datGenes$ensembl_gene_id,]
rm(getinfo, mart, datGenes_original)
#######################################################################################################
# Filtering
# 1. Filter genes with start or end position missing
to_keep = !is.na(datGenes$length)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
# 2. Filter genes that do not encode any protein
if(!all(rownames(datExpr)==rownames(datGenes))) print('!!! gene rownames do not match!!!')
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id
# 3. Filter genes with low expression levels
# 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## [1] "Removed 2811 genes, 19552 remaining"
print(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## [1] "968 SFARI genes remaining"
# Save datasets before level of expression filtering
datExpr_original = datExpr
datGenes_original = datGenes
datMeta_original = datMeta
Filtering outlier samples (there aren’t many)
Creating DESeq object and normalising using vst transformation
thresholds = c(0, 0.5, 1, seq(2,10), 12, 15, 17, 20, 30, 40, 50)
for(threshold in thresholds){
datMeta = datMeta_original
datExpr = datExpr_original
datGenes = datGenes_original
cat(paste0('\n\nFiltering with threshold: ', threshold,'\n'))
to_keep = rowMeans(datExpr)>threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
# Filter outlier samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
to_keep = z.ku >= -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
cat(paste0('Removing ', sum(!to_keep), ' samples\n'))
rm(absadj, netsummary, ku, z.ku, to_keep)
# Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis)
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
# Save summary results in dataframe
if(threshold == thresholds[1]){
mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
} else {
new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
}
}
##
##
## Filtering with threshold: 0
## alpha: 1.000000
## Removing 4 samples
##
##
## Filtering with threshold: 0.5
## alpha: 1.000000
## Removing 4 samples
##
##
## Filtering with threshold: 1
## alpha: 1.000000
## Removing 4 samples
##
##
## Filtering with threshold: 2
## alpha: 1.000000
## Removing 3 samples
##
##
## Filtering with threshold: 3
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 4
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 5
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 6
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 7
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 8
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 9
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 10
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 12
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 15
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 17
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 20
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 30
## alpha: 1.000000
## Removing 2 samples
##
##
## Filtering with threshold: 40
## alpha: 1.000000
## Removing 3 samples
##
##
## Filtering with threshold: 50
## alpha: 1.000000
## Removing 3 samples
rm(new_entries)
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<6] %>%
as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=6]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/20)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]
Note: Plotting all of the genes is too heavy, so I’m going to filter most of the genes with the highest levels of expression because we care about the behaviour of the genes with low levels
The worst part of the left tail disappears relatively quickly (threshold of around 3 or 4), but it doesn’t seem to disappear completely until a threshold of over 20 or 30 (which is too high to use because we would lose too many genes).
This general “s” pattern cannot be found in Gandal’s dataset, where the genes seem to be much more homoscedastic and the problematic pattern is found only in a small group of genes with very low levels of expression.
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) +
geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) +
scale_x_log10() + scale_y_log10() + theme_minimal())